-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - Allow GFNI intrinsics to be used in constexpr #169619
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@RKSimon I extracted the common functions( |
|
Also, all tests passed on my machine. I will work on the runtime vs compile-time results comparison and post it here. |
|
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-clang Author: NagaChaitanya Vellanki (chaitanyav) Changes — Resolves: #169295. Patch is 58.42 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/169619.diff — 6 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 4aa3d51931980..3b404c6c8bb04 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -408,39 +408,21 @@ let Features = "avx512f,vaes", Attributes = [NoThrow, Const, RequiredVectorWidth
def aesdeclast512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>)">;
}
-let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "gfni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def vgf2p8affineinvqb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
-}
-
-let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vgf2p8affineinvqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
-}
-
-let Features = "avx512f,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vgf2p8affineinvqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
-}
-
-let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vgf2p8affineqb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">;
-}
-
-let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
- def vgf2p8affineqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
-}
-
-let Features = "avx512f,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
- def vgf2p8affineqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
-}
-
-let Features = "gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vgf2p8mulb_v16qi : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
}
-let Features = "avx,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx,gfni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+ def vgf2p8affineinvqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
+ def vgf2p8affineqb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
def vgf2p8mulb_v32qi : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
}
-let Features = "avx512f,gfni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f,gfni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
+ def vgf2p8affineinvqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
+ def vgf2p8affineqb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant char)">;
def vgf2p8mulb_v64qi : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 2ab40ac9cc89c..c7cf53e611ffb 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3531,6 +3531,100 @@ static bool interp__builtin_ia32_shufbitqmb_mask(InterpState &S, CodePtr OpPC,
return true;
}
+static bool interp_builtin_ia32_gfni_affine(InterpState &S, CodePtr OpPC,
+ const CallExpr *Call,
+ bool Inverse) {
+ assert(Call->getNumArgs() == 3);
+ QualType XType = Call->getArg(0)->getType();
+ QualType AType = Call->getArg(1)->getType();
+ QualType ImmType = Call->getArg(2)->getType();
+ if (!XType->isVectorType() || !AType->isVectorType() ||
+ !ImmType->isIntegerType()) {
+ return false;
+ }
+
+ Pointer X, A;
+ APSInt Imm = popToAPSInt(S, Call->getArg(2));
+ A = S.Stk.pop<Pointer>();
+ X = S.Stk.pop<Pointer>();
+
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+ const auto *XVecT = XType->castAs<VectorType>();
+ const auto *AVecT = AType->castAs<VectorType>();
+ assert(XVecT->getNumElements() == AVecT->getNumElements());
+ unsigned NumBytesInQWord = 8;
+ unsigned NumBytes = AVecT->getNumElements();
+ unsigned NumBitsInQWord = 64;
+ unsigned NumQWords = NumBytes / NumBytesInQWord;
+ unsigned NumBitsInByte = 8;
+ PrimType AElemT = *S.getContext().classify(AVecT->getElementType());
+
+ // computing A*X + Imm
+ for (unsigned QWordIdx = 0; QWordIdx != NumQWords; ++QWordIdx) {
+ // Extract the QWords from X, A
+ APInt XQWord(NumBitsInQWord, 0);
+ APInt AQWord(NumBitsInQWord, 0);
+ for (unsigned ByteIdx = 0; ByteIdx != NumBytesInQWord; ++ByteIdx) {
+ unsigned Idx = QWordIdx * NumBytesInQWord + ByteIdx;
+ uint8_t XByte;
+ uint8_t AByte;
+ INT_TYPE_SWITCH(AElemT, {
+ XByte = static_cast<uint8_t>(X.elem<T>(Idx));
+ AByte = static_cast<uint8_t>(A.elem<T>(Idx));
+ });
+
+ XQWord.insertBits(APInt(NumBitsInByte, XByte), ByteIdx * NumBitsInByte);
+ AQWord.insertBits(APInt(NumBitsInByte, AByte), ByteIdx * NumBitsInByte);
+ }
+
+ for (unsigned ByteIdx = 0; ByteIdx != NumBytesInQWord; ++ByteIdx) {
+ unsigned Idx = QWordIdx * NumBytesInQWord + ByteIdx;
+ uint8_t XByte =
+ XQWord.lshr(ByteIdx * NumBitsInByte).getLoBits(8).getZExtValue();
+ INT_TYPE_SWITCH(AElemT, {
+ Dst.elem<T>(Idx) = T::from(GFNIAffine(XByte, AQWord, Imm, Inverse));
+ });
+ }
+ }
+ Dst.initializeAllElements();
+ return true;
+}
+
+static bool interp__builtin_ia32_gfni_mul(InterpState &S, CodePtr OpPC,
+ const CallExpr *Call) {
+ assert(Call->getNumArgs() == 2);
+
+ QualType AType = Call->getArg(0)->getType();
+ QualType BType = Call->getArg(1)->getType();
+ if (!AType->isVectorType() || !BType->isVectorType()) {
+ return false;
+ }
+
+ Pointer A, B;
+ B = S.Stk.pop<Pointer>();
+ A = S.Stk.pop<Pointer>();
+
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+ const auto *AVecT = AType->castAs<VectorType>();
+ const auto *BVecT = BType->castAs<VectorType>();
+ assert(AVecT->getNumElements() == BVecT->getNumElements());
+
+ PrimType AElemT = *S.getContext().classify(AVecT->getElementType());
+ unsigned NumBytes = A.getNumElems();
+
+ for (unsigned ByteIdx = 0; ByteIdx != NumBytes; ++ByteIdx) {
+ uint8_t AByte, BByte;
+ INT_TYPE_SWITCH(AElemT, {
+ AByte = static_cast<uint8_t>(A.elem<T>(ByteIdx));
+ BByte = static_cast<uint8_t>(B.elem<T>(ByteIdx));
+ Dst.elem<T>(ByteIdx) = T::from(GFNIMul(AByte, BByte));
+ });
+ }
+
+ Dst.initializeAllElements();
+ return true;
+}
+
bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
uint32_t BuiltinID) {
if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
@@ -4545,6 +4639,21 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return std::pair<unsigned, int>{SrcIdx,
static_cast<int>(LaneOffset + Index)};
});
+
+ case X86::BI__builtin_ia32_vgf2p8affineinvqb_v16qi:
+ case X86::BI__builtin_ia32_vgf2p8affineinvqb_v32qi:
+ case X86::BI__builtin_ia32_vgf2p8affineinvqb_v64qi:
+ return interp_builtin_ia32_gfni_affine(S, OpPC, Call, true);
+ case X86::BI__builtin_ia32_vgf2p8affineqb_v16qi:
+ case X86::BI__builtin_ia32_vgf2p8affineqb_v32qi:
+ case X86::BI__builtin_ia32_vgf2p8affineqb_v64qi:
+ return interp_builtin_ia32_gfni_affine(S, OpPC, Call, false);
+
+ case X86::BI__builtin_ia32_vgf2p8mulb_v16qi:
+ case X86::BI__builtin_ia32_vgf2p8mulb_v32qi:
+ case X86::BI__builtin_ia32_vgf2p8mulb_v64qi:
+ return interp__builtin_ia32_gfni_mul(S, OpPC, Call);
+
case X86::BI__builtin_ia32_insertps128:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned Mask) {
diff --git a/clang/lib/AST/ExprConstShared.h b/clang/lib/AST/ExprConstShared.h
index 401ae629c86bf..970c033bbf198 100644
--- a/clang/lib/AST/ExprConstShared.h
+++ b/clang/lib/AST/ExprConstShared.h
@@ -15,9 +15,12 @@
#define LLVM_CLANG_LIB_AST_EXPRCONSTSHARED_H
#include "clang/Basic/TypeTraits.h"
+#include <cstdint>
namespace llvm {
class APFloat;
+class APInt;
+class APSInt;
}
namespace clang {
class QualType;
@@ -74,4 +77,9 @@ void HandleComplexComplexDiv(llvm::APFloat A, llvm::APFloat B, llvm::APFloat C,
CharUnits GetAlignOfExpr(const ASTContext &Ctx, const Expr *E,
UnaryExprOrTypeTrait ExprKind);
+uint8_t GFNIMultiplicativeInverse(uint8_t Byte);
+uint8_t GFNIMul(uint8_t AByte, uint8_t BByte);
+uint8_t GFNIAffine(uint8_t XByte, llvm::APInt &AQword, llvm::APSInt Imm,
+ bool Inverse = false);
+
#endif
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 3b91678f7d400..fb6fe5aaad09f 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -13517,6 +13517,89 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
+ case X86::BI__builtin_ia32_vgf2p8affineinvqb_v16qi:
+ case X86::BI__builtin_ia32_vgf2p8affineinvqb_v32qi:
+ case X86::BI__builtin_ia32_vgf2p8affineinvqb_v64qi:
+ case X86::BI__builtin_ia32_vgf2p8affineqb_v16qi:
+ case X86::BI__builtin_ia32_vgf2p8affineqb_v32qi:
+ case X86::BI__builtin_ia32_vgf2p8affineqb_v64qi: {
+
+ APValue X, A;
+ APSInt Imm;
+ if (!EvaluateAsRValue(Info, E->getArg(0), X) ||
+ !EvaluateAsRValue(Info, E->getArg(1), A) ||
+ !EvaluateInteger(E->getArg(2), Imm, Info))
+ return false;
+
+ assert(X.isVector() && A.isVector());
+ assert(X.getVectorLength() == A.getVectorLength());
+
+ bool IsInverse = false;
+ switch (E->getBuiltinCallee()) {
+ case X86::BI__builtin_ia32_vgf2p8affineinvqb_v16qi:
+ case X86::BI__builtin_ia32_vgf2p8affineinvqb_v32qi:
+ case X86::BI__builtin_ia32_vgf2p8affineinvqb_v64qi: {
+ IsInverse = true;
+ }
+ }
+
+ unsigned NumBitsInByte = 8;
+ unsigned NumBytesInQWord = 8;
+ unsigned NumBitsInQWord = 64;
+ unsigned NumBytes = A.getVectorLength();
+ unsigned NumQWords = NumBytes / NumBytesInQWord;
+ SmallVector<APValue, 64> Result;
+ Result.reserve(NumBytes);
+
+ // computing A*X + Imm
+ for (unsigned QWordIdx = 0; QWordIdx != NumQWords; ++QWordIdx) {
+ // Extract the QWords from X, A
+ APInt XQWord(NumBitsInQWord, 0);
+ APInt AQWord(NumBitsInQWord, 0);
+ for (unsigned ByteIdx = 0; ByteIdx != NumBytesInQWord; ++ByteIdx) {
+ unsigned Idx = QWordIdx * NumBytesInQWord + ByteIdx;
+ APInt XByte = X.getVectorElt(Idx).getInt();
+ APInt AByte = A.getVectorElt(Idx).getInt();
+ XQWord.insertBits(XByte, ByteIdx * NumBitsInByte);
+ AQWord.insertBits(AByte, ByteIdx * NumBitsInByte);
+ }
+
+ for (unsigned ByteIdx = 0; ByteIdx != NumBytesInQWord; ++ByteIdx) {
+ uint8_t XByte =
+ XQWord.lshr(ByteIdx * NumBitsInByte).getLoBits(8).getZExtValue();
+ Result.push_back(APValue(APSInt(
+ APInt(8, GFNIAffine(XByte, AQWord, Imm, IsInverse)), false)));
+ }
+ }
+
+ return Success(APValue(Result.data(), Result.size()), E);
+ }
+
+ case X86::BI__builtin_ia32_vgf2p8mulb_v16qi:
+ case X86::BI__builtin_ia32_vgf2p8mulb_v32qi:
+ case X86::BI__builtin_ia32_vgf2p8mulb_v64qi: {
+ APValue A, B;
+ if (!EvaluateAsRValue(Info, E->getArg(0), A) ||
+ !EvaluateAsRValue(Info, E->getArg(1), B))
+ return false;
+
+ assert(A.isVector() && B.isVector());
+ assert(A.getVectorLength() == B.getVectorLength());
+
+ unsigned NumBytes = A.getVectorLength();
+ SmallVector<APValue, 64> Result;
+ Result.reserve(NumBytes);
+
+ for (unsigned ByteIdx = 0; ByteIdx != NumBytes; ++ByteIdx) {
+ uint8_t AByte = A.getVectorElt(ByteIdx).getInt().getZExtValue();
+ uint8_t BByte = B.getVectorElt(ByteIdx).getInt().getZExtValue();
+ Result.push_back(APValue(
+ APSInt(APInt(8, GFNIMul(AByte, BByte)), /*IsUnsigned=*/false)));
+ }
+
+ return Success(APValue(Result.data(), Result.size()), E);
+ }
+
case X86::BI__builtin_ia32_insertf32x4_256:
case X86::BI__builtin_ia32_inserti32x4_256:
case X86::BI__builtin_ia32_insertf64x2_256:
@@ -19008,6 +19091,87 @@ bool ComplexExprEvaluator::VisitCastExpr(const CastExpr *E) {
llvm_unreachable("unknown cast resulting in complex value");
}
+uint8_t GFNIMultiplicativeInverse(uint8_t Byte) {
+ // Lookup Table for Multiplicative Inverse in GF(2^8)
+ const uint8_t GFInv[256] = {
+ 0x00, 0x01, 0x8d, 0xf6, 0xcb, 0x52, 0x7b, 0xd1, 0xe8, 0x4f, 0x29, 0xc0,
+ 0xb0, 0xe1, 0xe5, 0xc7, 0x74, 0xb4, 0xaa, 0x4b, 0x99, 0x2b, 0x60, 0x5f,
+ 0x58, 0x3f, 0xfd, 0xcc, 0xff, 0x40, 0xee, 0xb2, 0x3a, 0x6e, 0x5a, 0xf1,
+ 0x55, 0x4d, 0xa8, 0xc9, 0xc1, 0x0a, 0x98, 0x15, 0x30, 0x44, 0xa2, 0xc2,
+ 0x2c, 0x45, 0x92, 0x6c, 0xf3, 0x39, 0x66, 0x42, 0xf2, 0x35, 0x20, 0x6f,
+ 0x77, 0xbb, 0x59, 0x19, 0x1d, 0xfe, 0x37, 0x67, 0x2d, 0x31, 0xf5, 0x69,
+ 0xa7, 0x64, 0xab, 0x13, 0x54, 0x25, 0xe9, 0x09, 0xed, 0x5c, 0x05, 0xca,
+ 0x4c, 0x24, 0x87, 0xbf, 0x18, 0x3e, 0x22, 0xf0, 0x51, 0xec, 0x61, 0x17,
+ 0x16, 0x5e, 0xaf, 0xd3, 0x49, 0xa6, 0x36, 0x43, 0xf4, 0x47, 0x91, 0xdf,
+ 0x33, 0x93, 0x21, 0x3b, 0x79, 0xb7, 0x97, 0x85, 0x10, 0xb5, 0xba, 0x3c,
+ 0xb6, 0x70, 0xd0, 0x06, 0xa1, 0xfa, 0x81, 0x82, 0x83, 0x7e, 0x7f, 0x80,
+ 0x96, 0x73, 0xbe, 0x56, 0x9b, 0x9e, 0x95, 0xd9, 0xf7, 0x02, 0xb9, 0xa4,
+ 0xde, 0x6a, 0x32, 0x6d, 0xd8, 0x8a, 0x84, 0x72, 0x2a, 0x14, 0x9f, 0x88,
+ 0xf9, 0xdc, 0x89, 0x9a, 0xfb, 0x7c, 0x2e, 0xc3, 0x8f, 0xb8, 0x65, 0x48,
+ 0x26, 0xc8, 0x12, 0x4a, 0xce, 0xe7, 0xd2, 0x62, 0x0c, 0xe0, 0x1f, 0xef,
+ 0x11, 0x75, 0x78, 0x71, 0xa5, 0x8e, 0x76, 0x3d, 0xbd, 0xbc, 0x86, 0x57,
+ 0x0b, 0x28, 0x2f, 0xa3, 0xda, 0xd4, 0xe4, 0x0f, 0xa9, 0x27, 0x53, 0x04,
+ 0x1b, 0xfc, 0xac, 0xe6, 0x7a, 0x07, 0xae, 0x63, 0xc5, 0xdb, 0xe2, 0xea,
+ 0x94, 0x8b, 0xc4, 0xd5, 0x9d, 0xf8, 0x90, 0x6b, 0xb1, 0x0d, 0xd6, 0xeb,
+ 0xc6, 0x0e, 0xcf, 0xad, 0x08, 0x4e, 0xd7, 0xe3, 0x5d, 0x50, 0x1e, 0xb3,
+ 0x5b, 0x23, 0x38, 0x34, 0x68, 0x46, 0x03, 0x8c, 0xdd, 0x9c, 0x7d, 0xa0,
+ 0xcd, 0x1a, 0x41, 0x1c};
+
+ return GFInv[Byte];
+}
+
+uint8_t GFNIAffine(uint8_t XByte, APInt &AQword, APSInt Imm, bool Inverse) {
+ unsigned NumBitsInByte = 8;
+ // Computing the affine transformation
+ uint8_t RetByte = 0;
+ for (uint32_t BitIdx = 0; BitIdx != NumBitsInByte; ++BitIdx) {
+ uint8_t AByte =
+ AQword.lshr((7 - static_cast<int32_t>(BitIdx)) * NumBitsInByte)
+ .getLoBits(8)
+ .getZExtValue();
+ uint8_t Product;
+ if (Inverse) {
+ Product = AByte & GFNIMultiplicativeInverse(XByte);
+ } else {
+ Product = AByte & XByte;
+ }
+ uint8_t Parity = 0;
+
+ // Dot product in GF(2) uses XOR instead of addition
+ for (unsigned PBitIdx = 0; PBitIdx != NumBitsInByte; ++PBitIdx) {
+ Parity = Parity ^ ((Product >> PBitIdx) & 0x1);
+ }
+
+ uint8_t Temp = Imm[BitIdx] ? 1 : 0;
+ RetByte |= (Temp ^ Parity) << BitIdx;
+ }
+ return RetByte;
+}
+
+uint8_t GFNIMul(uint8_t AByte, uint8_t BByte) {
+ // Multiplying two polynomials of degree 7
+ // Polynomial of degree 7
+ // x^7 + x^6 + x^5 + x^4 + x^3 + x^2 + x + 1
+ uint16_t TWord = 0;
+ unsigned NumBitsInByte = 8;
+ for (unsigned BitIdx = 0; BitIdx != NumBitsInByte; ++BitIdx) {
+ if ((BByte >> BitIdx) & 0x1) {
+ TWord = TWord ^ (AByte << BitIdx);
+ }
+ }
+
+ // When multiplying two polynomials of degree 7
+ // results in a polynomial of degree 14
+ // so the result has to be reduced to 7
+ // Reduction polynomial is x^8 + x^4 + x^3 + x + 1 i.e. 0x11B
+ for (int32_t BitIdx = 14; BitIdx > 7; --BitIdx) {
+ if ((TWord >> BitIdx) & 0x1) {
+ TWord = TWord ^ (0x11B << (BitIdx - 8));
+ }
+ }
+ return (TWord & 0xFF);
+}
+
void HandleComplexComplexMul(APFloat A, APFloat B, APFloat C, APFloat D,
APFloat &ResR, APFloat &ResI) {
// This is an implementation of complex multiplication according to the
diff --git a/clang/lib/Headers/gfniintrin.h b/clang/lib/Headers/gfniintrin.h
index 1df1eace63759..2c559f13c6577 100644
--- a/clang/lib/Headers/gfniintrin.h
+++ b/clang/lib/Headers/gfniintrin.h
@@ -15,6 +15,35 @@
#define __GFNIINTRIN_H
/* Default attributes for simple form (no masking). */
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("gfni"), \
+ __min_vector_width__(128))) constexpr
+
+/* Default attributes for YMM unmasked form. */
+#define __DEFAULT_FN_ATTRS_Y \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), \
+ __min_vector_width__(256))) constexpr
+
+/* Default attributes for VLX masked forms. */
+#define __DEFAULT_FN_ATTRS_VL128 \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("avx512bw,avx512vl,gfni"), \
+ __min_vector_width__(128))) constexpr
+#define __DEFAULT_FN_ATTRS_VL256 \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("avx512bw,avx512vl,gfni"), \
+ __min_vector_width__(256))) constexpr
+
+/* Default attributes for ZMM unmasked forms. */
+#define __DEFAULT_FN_ATTRS_Z \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512f,gfni"), \
+ __min_vector_width__(512))) constexpr
+/* Default attributes for ZMM masked forms. */
+#define __DEFAULT_FN_ATTRS_Z_MASK \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), \
+ __min_vector_width__(512))) constexpr
+#else
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("gfni"), \
__min_vector_width__(128)))
@@ -42,6 +71,7 @@
#define __DEFAULT_FN_ATTRS_Z_MASK \
__attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), \
__min_vector_width__(512)))
+#endif
#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
diff --git a/clang/test/CodeGen/X86/gfni-builtins.c b/clang/test/CodeGen/X86/gfni-builtins.c
index 7f196e08f4d80..0f1b62fb2c48f 100644
--- a/clang/test/CodeGen/X86/gfni-builtins.c
+++ b/clang/test/CodeGen/X86/gfni-builtins.c
@@ -1,43 +1,54 @@
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -emit-llvm -o - | FileCheck %s --check-prefix SSE
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes SSE,AVX
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -target-feature +avx512f -target-feature +avx512vl -emit-llvm -o - | FileCheck %s --check-prefixes SSE,AVX,AVX512
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - | FileCheck %s --check-prefixes SSE,AVX,AVX512,AVX512BW
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -emit-llvm -o - | FileCheck %s --check-prefix SSE
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +gfni -target-feature +avx -emit-llvm -o - | FileCheck %s --check-prefixes SSE,AVX
+// RUN: %c...
[truncated]
|
clang/lib/AST/ExprConstShared.h
Outdated
|
|
||
| uint8_t GFNIMultiplicativeInverse(uint8_t Byte); | ||
| uint8_t GFNIMul(uint8_t AByte, uint8_t BByte); | ||
| uint8_t GFNIAffine(uint8_t XByte, llvm::APInt &AQword, llvm::APSInt Imm, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| uint8_t GFNIAffine(uint8_t XByte, llvm::APInt &AQword, llvm::APSInt Imm, | |
| uint8_t GFNIAffine(uint8_t XByte, const llvm::APInt &AQword, const llvm::APInt &Imm, |
| CharUnits GetAlignOfExpr(const ASTContext &Ctx, const Expr *E, | ||
| UnaryExprOrTypeTrait ExprKind); | ||
|
|
||
| uint8_t GFNIMultiplicativeInverse(uint8_t Byte); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would we be better off putting these in APIntOps or MathExtras ? @tbaederr - thoughts?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is that needed anywhere else? In regular codegen maybe?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
x86 (middleend/backend) might end up with some additional gfni folds at some point, I don't know if any other targets have an identical instruction
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please let me know the appropriate place for the helper functions, i will move them to the preferred location.
… GFNI intrinsics to be used in constexpr. Resolves: llvm#169295
4ce7ac1 to
62caae7
Compare
|
attaching the fuzz results for GFNI intrinsics |
Resolves:#169295